Project Description

This document is part of the Explorary Data Analysis (EDA) for a kaggle project aiming to improve the “Zestimates” (home value) prediction algorithm used by Zillow. I loaded the raw data from the following project page, and conducted data visualization to facilitate model fitting

https://www.kaggle.com/c/zillow-prize-1

Load the data

rm(list=ls())
library(lattice)
library(ggplot2)
library(rmutil)
library(corrplot)
library(leaflet)
library(plotly)
library(GGally)
library(knitr)
library(shiny)
train <- read.csv("train_property.csv", stringsAsFactors = F)

Extract transaction year, transaction month, transaction day

Generate time related features

train$trans_year <- sapply(strsplit(train$transactiondate, '-'), '[', 1) 
train$trans_month <- sapply(strsplit(train$transactiondate, '-'), '[', 2)
train$trans_day <- sapply(strsplit(train$transactiondate, '-'), '[', 3)
train$trans_weekday <- weekdays(as.Date(train$transactiondate))
train$trans_DATE <- as.Date(train$transactiondate)

Rename the variables to be more descriptive

train <- plyr::rename(train,
                     c("parcelid"="id_parcel",
                       "transactiondate" = "trans_date",
                       "yearbuilt" = "build_year",
                       "basementsqft"="area_base_living",
                       "yardbuildingsqft17"="area_patio",
                       "yardbuildingsqft26"="area_shed",
                       "poolsizesum"="area_pool",
                       "lotsizesquarefeet"="area_lot",
                       "garagetotalsqft"="area_garage",
                       "finishedfloor1squarefeet" = "area_firstfloor_finished",
                       "calculatedfinishedsquarefeet" = "area_total_calc",
                       "finishedsquarefeet6" = "area_base",
                       "finishedsquarefeet12" = "area_live_finished",
                       "finishedsquarefeet13" = "area_liveperi_finished",
                       "finishedsquarefeet15" = "area_total_finished",
                       "finishedsquarefeet50" = "area_unknown",
                       "unitcnt" = "num_unit",
                       "numberofstories" = "num_story",
                       "roomcnt" = "num_room",
                       "bathroomcnt" = "num_bathroom",
                       "bedroomcnt" = "num_bedroom",
                       "calculatedbathnbr" = "num_bathroom_calc",
                       "fullbathcnt" = "num_bath",
                       "threequarterbathnbr" = "num_75_bath",
                       "fireplacecnt" = "num_fireplace",
                       "poolcnt" = "num_pool",
                       "garagecarcnt" = "num_garage",
                       "regionidcounty" = "region_county",
                       "regionidcity" = "region_city",
                       "regionidzip" = "region_zip",
                       "regionidneighborhood" = "region_neighbor",
                       "taxvaluedollarcnt" = "tax_total",
                       "structuretaxvaluedollarcnt" = "tax_building",
                       "landtaxvaluedollarcnt" = "tax_land",
                       "taxamount" = "tax_property",
                       "assessmentyear" = "tax_year",
                       "taxdelinquencyflag" = "tax_delinquency",
                       "taxdelinquencyyear" = "tax_delinquency_year",
                       "propertyzoningdesc" = "zoning_property",
                       "propertylandusetypeid" = "zoning_landuse",
                       "propertycountylandusecode" = "zoning_landuse_county",
                       "fireplaceflag" = "flag_fireplace",
                       "hashottuborspa" = "flag_tub",
                       "buildingqualitytypeid" = "quality",
                       "buildingclasstypeid" = "framing",
                       "typeconstructiontypeid" = "material",
                       "decktypeid" = "deck",
                       "storytypeid" = "story",
                       "heatingorsystemtypeid" = "heating",
                       "airconditioningtypeid" = "aircon",
                       "architecturalstyletypeid" = "architectural_style",
                       "pooltypeid10" = "flag_spa",
                       "pooltypeid2" = "flag_pool_spa",
                       "pooltypeid7" = "flag_pool_tub",
                       "fips"="county"))

An overview of data type

# numerical variable
variable_numeric = c("area_firstfloor_finished",
                     "area_base", "area_base_living",
                     "area_garage", 
                     "area_live_finished",
                     "area_liveperi_finished",
                     "area_lot",
                     "area_patio",
                     "area_pool",
                     "area_shed",
                     "area_total_calc",
                     "area_total_finished",
                     "area_unknown",
                     "tax_building",
                     "tax_land",
                     "tax_property",
                     "tax_total",
                     "latitude",
                     "longitude")
# discrete
variable_discrete = c("num_75_bath",
                      "num_bath",
                      "num_bathroom",
                      "num_bathroom_calc",
                      "num_bedroom",
                      "num_fireplace",
                      "num_garage",
                      "num_pool",
                      "num_room",
                      "num_story",
                      "num_unit")

variable_binary = c("flag_fireplace",
                    "flag_tub",
                    "flag_spa",
                    "flag_pool_spa",
                    "flag_pool_tub",
                    "tax_delinquency")

# categorical variable
variable_nominal = c("aircon",
                     "architectural_style",
                     "county",
                     "deck",
                     "framing",
                     "heating",
                     "id_parcel",
                     "material",
                     "region_city",
                     "region_county",
                     "region_neighbor",
                     "region_zip",
                     "story",
                     "zoning_landuse",
                     "zoning_landuse_county")

variable_ordinal = c("quality")

# date
variable_date = c("tax_year",
                  "build_year",
                  "tax_delinquency_year",
                  "trans_year",
                  "trans_month",
                  "trans_day",
                  "trans_date",
                  "trans_weekday")

# others
variable_unstruct = c("zoning_property")

# don't understand
variable_unknown = c('censustractandblock',
                     'rawcensustractandblock')

Reasonable conversion

# convert binary to 0, 1

train[train$flag_fireplace == "", "flag_fireplace"] = 0
train[train$flag_fireplace == "true", "flag_fireplace"] = 1
train[train$flag_tub == "", "flag_tub"] = 0
train[train$flag_tub == "true", "flag_tub"] = 1
train[train$tax_delinquency == "", "tax_delinquency"] = 0
train[train$tax_delinquency == "Y", "tax_delinquency"] = 1

# convert to date to int
train[,variable_date] = sapply(train[,variable_date], as.character)

# convert to numeric to double
train[,variable_numeric] = sapply(train[,variable_numeric], as.numeric)

# convert to discrete to int
train[,c(variable_discrete, variable_binary)] = sapply(train[,c(variable_discrete, variable_binary)], as.integer)

# convert to categorical to character
train[,c(variable_nominal, variable_ordinal)] = sapply(train[,c(variable_nominal, variable_ordinal)], as.character)

Redundant features

1. county & region_county

par(mfrow=c(1,2)) #plot in 1 figure with 2 subplot(1 row, 2 col)
barplot(sort(table(train$county))) # sorted barplot
barplot(sort(table(train$region_county)))

2.num_bathroom, num_bathroom_calc

df = train[!is.na(train$num_bathroom_calc),
           c("num_bathroom","num_bathroom_calc")]

ggplot(data = df,
       aes(x=num_bathroom,
           y=num_bathroom_calc)) + geom_point() + theme_minimal()

sum(df$num_bathroom == df$num_bathroom_calc) == nrow(df)
## [1] TRUE

3. num_bathroom, num_bath

df = train[!is.na(train$num_bath),
           c("num_bathroom","num_bath")]

ggplot(data = df,
       aes(x=num_bath,
           y=num_bathroom)) + geom_point() + theme_minimal()

sum(df$num_bath == df$num_bathroom)/nrow(df)
## [1] 0.9989113

Missing data overview

num.NA <- sort(colSums(sapply(train, is.na)))

dfnum.NA <- data.frame(ind = c(1:length(num.NA)),
                       percentage = num.NA/nrow(train),
                       per80 = num.NA/nrow(train)>=0.2,
                       name = names(num.NA),
                       row.names = NULL) # convert to data.frame

ggplot(data = dfnum.NA, aes(x=ind, y=percentage)) + 
  geom_bar(aes(fill=per80), stat="identity") + 
  scale_x_discrete(name ="column names", 
                   limits=dfnum.NA$name)+
  theme(axis.text.x = element_text(angle=90, hjust=1, vjust=.5),
        legend.position = "none") +
  geom_hline(yintercept = 0.2) + 
  ggtitle("Percentage of Missing")

Missing column similarity

# overall missing similarity

MissSim <- function(X, feature){
  
  na.df <- as.data.frame(is.na(X)) # convert to na.df
  # Jaccard dist: 0-1, 0 means similar, 1 means not similar
  na.similarity <- as.data.frame(as.matrix(proxy::dist(t(na.df),
                                                       method = "jaccard",
                                                       diag = T, upper = T))) # calculate similarity between columns
  na.mat = as.matrix(na.similarity[order(na.similarity[feature,]), order(na.similarity[feature,])]) # ordered na.similarity, then convert to matrix

  
  plot_ly(x=colnames(na.mat),
          y=rownames(na.mat),
          z = na.mat,
          type = "heatmap")
}

MissSim(train, "area_base")

It seams some features are not missing randomly +missing value of binary +missing value of discrete +missing value of nominal +missing value of numeric

# missing value of binary
MissSim(train[, variable_binary], "flag_fireplace")
# missing value of discrete
MissSim(train[, variable_discrete], "num_story")
# missing value of nominal
MissSim(train[, variable_nominal], "story")
# missing value of numeric
MissSim(train[, variable_numeric], "area_base")

Outcome overview: logerror

ggplot(data = train, aes(x=logerror)) + 
  geom_histogram(aes(y=..count../sum(..count..)),
                 bins = 400,
                 fill="red",
                 color="black") +
  theme_minimal()+
  theme(axis.title = element_text(size=16),
        axis.text = element_text(size=14))+
  ylab("Count")+coord_cartesian(x=c(-0.5,0.5))

Distribution of logerror:

ggplot(train, aes(sample = logerror), shape = 21, alpha = 0.8) +
  stat_qq() + theme_minimal()

A reminder of Laplace distribution

logerror_ecdf = ecdf(train$logerror)
x = sort(train$logerror)
cdfl = plaplace(x, m = mean(train$logerror) ,s=sd(train$logerror)*0.4)
qq = data.frame(cdfl = cdfl, ecdf = logerror_ecdf(x))
ggplot(data = qq, aes(x=cdfl, y=ecdf)) + geom_line() + theme_minimal()

Consider using absolute value of logerror

train$abs_logerror <- abs(train$logerror)
ggplot(data=train, aes(x=abs_logerror)) + 
  geom_histogram(bins=400,
                 fill="red", color="black")+
  theme_bw()+theme(axis.title = element_text(size=16),
                   axis.text = element_text(size=14))+
  ylab("Count")+coord_cartesian(x=c(0,0.5))

abs_error.DATE = by(train, train$trans_DATE, function(x){return(mean(x$abs_logerror))})
error.DATE = by(train, train$trans_DATE, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.DATE, type="b") + title("date v.s abs_logerror")
## integer(0)
plot(error.DATE, type="b") + title("date v.s logerror")
## integer(0)
barplot(table(train$trans_DATE)) + title("histogram of date")

## numeric(0)

```

abs_error.month = by(train, train$trans_month, function(x){return(mean(x$abs_logerror))})
error.month = by(train, train$trans_month, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.month, type="b") + title("month v.s abs_logerror")
## integer(0)
plot(error.month, type="b") + title("month v.s logerror")
## integer(0)
barplot(table(train$trans_month)) + title("histogram of month")

## numeric(0)
abs_error.day = by(train, train$trans_day, function(x){return(mean(x$abs_logerror))})
error.day = by(train, train$trans_day, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(abs_error.day, type="b") + title("day v.s abs_logerror")
## integer(0)
plot(error.day, type="b") + title("day v.s logerror")
## integer(0)
barplot(table(train$trans_day)) + title("histogram of day")

## numeric(0)
abs_error.weekday = by(train, train$trans_weekday, function(x){return(mean(x$abs_logerror))})
error.weekday = by(train, train$trans_weekday, function(x){return(mean(x$logerror))})
par(mfrow=c(3,1))
plot(sort(abs_error.weekday), type="b") + title("weekday v.s abs_logerror")
## integer(0)
plot(sort(error.weekday), type="b") + title("weekday v.s logerror")
## integer(0)
barplot(sort(table(train$trans_weekday))) + title("histogram of weekday")

## numeric(0)
abs_error.build_year = by(train, train$build_year, function(x){return(mean(x$abs_logerror))})
error.build_year = by(train, train$build_year, function(x){return(mean(x$logerror))})
err.build_year = data.frame(year = as.integer(sort(unique(train$build_year))),
                            abs_logerror = as.numeric(abs_error.build_year),
                            logerror = as.numeric(error.build_year))
ggplot(data=err.build_year, aes(x = year, y = abs_logerror)) + 
  geom_point() + geom_line() + geom_smooth() + geom_hline(yintercept = mean(train$abs_logerror), color="red")

ggplot(data=err.build_year, aes(x = year, y = logerror)) + 
  geom_point() + geom_line() + geom_smooth() + coord_cartesian(ylim=c(-0.1,0.1))+geom_hline(yintercept = mean(train$logerror), color="red")

abs_logerror, log_error ~ (variable_numeric, discrete) : linear relationship

par(mfrow=c(1,1))

remain.col <- names(num.NA)[which(num.NA <= 0.8 * dim(train)[1])] # trainT = train
remain.numeric <- remain.col[which(remain.col %in% variable_numeric)]
remain.discrete <- remain.col[which(remain.col %in% variable_discrete)]
train.remain <- train[, c("abs_logerror",remain.numeric)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # numeric

train.remain <- train[, c("abs_logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete

train.remain <- train[, c("abs_logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete

train.remain <- train[, c("logerror",remain.numeric)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # numeric

train.remain <- train[, c("logerror",remain.discrete)]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete

train.remain <- train[, c("logerror",c(remain.discrete, remain.numeric))]
corrplot(cor(train.remain, use = "complete.obs"), type="lower") # discrete

Discrete variables

df = train[, c("num_bath", "num_bedroom", "num_room")]
df.dropna = df[complete.cases(df),]
ggpairs(df.dropna,
        method = "square",
        tl.cex = 1,
        type = 'lower')

Quality variable

df = train[,c("quality","logerror")]
df$quality = as.integer(df$quality)

ggplot(data=df, aes(x=as.factor(quality), y=logerror)) + 
  geom_jitter(alpha=0.3, color='lightgrey') +
  geom_boxplot(outlier.color=NA, color='steelblue') + 
  labs(x='quality')

Visualization of Geographic Information

Show position of house on the map

# map on large abs_logerr
lower = quantile(train$abs_logerror, 0.1)
upper = quantile(train$abs_logerror, 0.9)

well_estimate = train[((train$abs_logerror < upper) & (train$abs_logerror > lower)),
                      c("longitude", "latitude", "abs_logerror")]
bad_estimate = train[((train$abs_logerror > upper) | (train$abs_logerror < lower)),
                     c("longitude", "latitude", "abs_logerror")]


over_est = quantile(train$logerror, 0.99)
over_estimate = train[(train$logerror > upper),
                      c("longitude", "latitude", "logerror")]
over_estimate$type = "over"

under_est = quantile(train$logerror, 0.01)
under_estimate = train[(train$abs_logerror < lower),
                     c("longitude", "latitude", "logerror")]
under_estimate$type = "under"

pal <- colorFactor(c("red","navy"), domain = c("over", "under"))

rbind(over_estimate, under_estimate) %>%
leaflet() %>%
  addTiles() %>%
  addCircleMarkers(lng=~longitude/10e5,
             lat=~latitude/10e5,
             color = ~pal(type),
             radius = 3,
             stroke = FALSE, fillOpacity = 0.3)
mapRegion = function(X, ub, lb){
  
  over_est = quantile(train$logerror, ub)
  over_estimate = X[(X$logerror > over_est),
                        c("longitude", "latitude", "logerror")]
  over_estimate$type = "over"
  
  under_est = quantile(train$logerror, lb)
  under_estimate = X[(X$abs_logerror < under_est),
                         c("longitude", "latitude", "logerror")]
  under_estimate$type = "under"
  
  pal <- colorFactor(c("red", "navy"), domain = c("over", "under"))
  
  rbind(over_estimate, under_estimate) %>%
    leaflet() %>%
    addTiles() %>%
    addCircleMarkers(lng=~longitude/10e5,
                     lat=~latitude/10e5,
                     color = ~pal(type),
                     radius = 5,
                     stroke = FALSE, fillOpacity = 0.5)
}

region = train[!is.na(train$region_city)&(train$region_city == "5534"),]
mapRegion(region, 0.9, 0.5)